##            Date    Site N1 N1Error Filter Well  HFGN1
## 3930 2021-02-20 Madison NA      NA      2    1 253368
## 3931 2021-02-20 Madison NA      NA      2    2 252801
## 3932 2021-02-20 Madison NA      NA      2    3 173262
## 3933 2021-02-20 Madison NA      NA      3    1 167274
## 3934 2021-02-20 Madison NA      NA      3    2 126110
## 3935 2021-02-20 Madison NA      NA      3    3 120750

LIMS N1 agrees with the HFG data, implying that using it to find an underlying trend in the high-frequency data should be faithful.

# Restrict to the window surrounding the HFG sampling period (+/- 14 days).
CompFullDF <- FullDF %>%
  filter(
    Date > StartHFG - 14,
    Date < EndHFG + 14
  )
# A HFGN1 < 1e7 outlier filter was considered here but is left disabled:
# HFGN1 < 1e7


# Split out rows with each measurement present so the two series can be
# layered independently in the comparison plot.
CompN1DF <- CompFullDF %>%
  filter(!is.na(N1))    # LIMS N1 observations only
CompHFGDF <- CompFullDF %>%
  filter(!is.na(HFGN1)) # high-frequency observations only



# Overlay the LIMS N1 series (line) on the high-frequency points, one panel
# per site, with a log10 y-axis.
CompPlot <- ggplot(mapping = aes(x = Date)) +
  geom_point(data = CompHFGDF, aes(y = HFGN1, color = Filter), size = 0.1) +
  geom_line(data = CompN1DF, aes(y = N1, color = "Lims Data"), size = 0.5) +
  scale_y_log10() +
  facet_wrap(~Site) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

ggplotly(CompPlot)

We apply the smoothing used in the main analysis to the N1 data.

# Fit a loess trend to the LIMS N1 series for a single site.
#
# SmoothedSite: site name used to subset `FullDF`.
# span: loess span; the default 0.4 matches the value actually used in this
#   analysis (not rigorously chosen). Exposed as a parameter so the
#   smoothness can be tuned without editing the function.
# Returns the site's Site/Date/N1 rows with a `loessN1` column holding the
# fitted trend.
GroupLoess <- function(SmoothedSite, span = 0.4) {
  FilteredDF <- FullDF %>%
    filter(Site == SmoothedSite) %>%
    select(Site, Date, N1)
  # loessFit (presumably limma::loessFit -- confirm) tolerates the NA-heavy
  # N1 column; 2 robustifying iterations remove some bad patterns.
  FilteredDF$loessN1 <- loessFit(
    y = FilteredDF$N1,
    x = FilteredDF$Date,
    span = span,
    iterations = 2
  )$fitted
  FilteredDF
}

# Fit the per-site loess trends and stack them into one data frame.
FullLoessData <- HFGLocations %>%
  lapply(GroupLoess) %>%
  bind_rows() %>%
  distinct()

# Attach the trend to the high-frequency data; keep only dates where a
# fitted trend value exists, and make Filter a factor for plotting.
FullDF2 <- HFGDF %>%
  inner_join(FullLoessData, by = c("Date", "Site")) %>%
  filter(!is.na(loessN1)) %>%
  mutate(Filter = as.factor(Filter))


# High-frequency points (colored by filter) against the loess trend line,
# per site, on a log10 y-axis. Extreme HFGN1 values are excluded so the
# scale stays readable.
CompPlot2 <- FullDF2 %>%
  filter(HFGN1 < 4e7) %>%
  ggplot(aes(x = Date)) +
  geom_point(aes(y = HFGN1, color = Filter), size = 0.1) +
  geom_line(aes(y = loessN1), size = 1) +
  scale_y_log10() +
  facet_wrap(~Site) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

ggplotly(CompPlot2)

With the small amount of case data in the region, the smoothing seems to generalize reasonably.

# Window the trend data around the HFG period and drop extreme outliers.
CompFullDF <- FullDF2 %>%
  filter(
    Date > StartHFG - 21,
    Date < EndHFG + 14,
    HFGN1 < 1e7
  )

# Rows where a fitted trend value is available.
CompN1DF <- CompFullDF %>%
  filter(!is.na(loessN1))

# Case counts: -999 is a missing-data sentinel, so recode it to NA and drop.
CompCaseHFGDF <- HFGCaseDF %>%
  mutate(ReportedCases = na_if(ReportedCases, -999)) %>%
  filter(!is.na(ReportedCases))



# Shape comparison: reported cases against the (rescaled) loess N1 trend.
CompPlot3 <- ggplot(mapping = aes(x = Date)) +
  geom_point(data = CompCaseHFGDF,
             aes(y = ReportedCases, color = "ReportedCases"), size = 1) +
  # N1 is divided by 1e4 purely to put both series on a comparable scale.
  geom_line(data = CompN1DF,
            aes(y = loessN1 / 1e4, color = "Lims Data"), size = 1) +
  # scale_y_log10() +
  facet_wrap(~Site) +
  ggtitle("Simple shape analysis") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

CompPlot3

Once we remove the trend we have a dataset of errors. The resulting error is normally distributed and its relationship with the trend is minimal.

# Detrend: residual of each HFG measurement from the loess trend, on a log
# scale. NOTE(review): log(HFGN1 - loessN1) is NaN whenever a point falls
# below the trend, so those rows are discarded by the is.na filter —
# confirm that the log-ratio log(HFGN1) - log(loessN1) wasn't intended.
CompFullDF <- FullDF2 %>%
  filter(HFGN1 < 1e7) %>%
  mutate(TrendError = log(HFGN1 - loessN1)) %>%
  filter(!is.na(TrendError))


# Distribution of the residuals, per site.
ggplot(CompFullDF) +
  geom_histogram(aes(x = TrendError), size = 1) +
  facet_wrap(~Site)

# Residuals against the trend level: checks whether error size scales
# with N1.
ggplot(CompFullDF) +
  geom_point(aes(x = loessN1, y = TrendError), size = 1) +
  facet_wrap(~Site) +
  scale_x_log10() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Residuals over time: checks for drift.
ggplot(CompFullDF) +
  geom_point(aes(x = Date, y = TrendError), size = 1) +
  facet_wrap(~Site)

We can control for filter-to-filter differences in variance by taking the variance of the errors for each day. The resulting data is log-normal and does not significantly scale with N1.

# Per-day technical variance across the (up to three) replicate filters.
# Days with a single replicate are dropped: variance is undefined for n = 1.
CompFullDF2 <- CompFullDF %>%
  group_by(Site, Date) %>%
  summarize(
    TechnicalVariance = var(TrendError),
    # NOTE(review): N1 can be NA here, which makes LogN1 NA for that day;
    # add na.rm = TRUE if those days should still get a mean — confirm.
    LogN1 = mean(log(N1)),
    N = n(),
    .groups = "drop_last" # summarize()'s default, made explicit to
                          # silence the regrouping message
  ) %>%
  filter(N != 1)



# Shape of the per-day variances. Surprisingly log-normal, given the
# residuals were already computed on the log scale.
CompFullDF2 %>%
  ggplot(aes(x = log(TechnicalVariance))) +
  geom_histogram(bins = 15) +
  facet_wrap(~Site)

# Measured variance against log N1: looking for any scaling relationship.
CompFullDF2 %>%
  ggplot(aes(y = log(TechnicalVariance))) +
  geom_point(aes(x = LogN1)) +
  facet_wrap(~Site)

From this we can calculate the variance to get a handle on the day-to-day fluctuations. We can either take the variance of the errors, or take the mean of the per-sample variances. Both give similar answers, and which one is better depends on what assumptions we want to make about the errors.

# Pooled variance of all residuals. Assumes the errors share a common mean,
# which holds by construction (the trend was subtracted out).
VarianceError <- var(CompFullDF$TrendError, na.rm = TRUE)

# Alternative: mean of the per-day variances, weighted by replicate count.
# Does not assume the per-day means are equal; per the printed output below
# it comes out smaller than the pooled variance above.
MeanVariance <- weighted.mean(
  CompFullDF2$TechnicalVariance,
  CompFullDF2$N,
  na.rm = TRUE
)


# Report both variance estimates, rounded for display. (The "##" lines are
# the knitr output captured alongside the source.)
variance_msg <- paste("Variance of Trend Errors:", round(VarianceError, 4))
print(variance_msg)
## [1] "Variance of Trend Errors: 2.141"
weighted_msg <- paste("weighted mean of variances", round(MeanVariance, 4))
print(weighted_msg)
## [1] "weighted mean of variances 1.0365"